# real targets
-data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.rb data/charwidths.jl
+data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.jl
$(MAKE) -C data utf8proc_data.c.new
utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c
# Unicode data generation rules. Except for the test data files, most
# users will not use these Makefile rules, which are primarily to re-generate
# unicode_data.c when we get a new Unicode version or charwidth data; they
-# require ruby and julia to be installed.
+# require julia to be installed.
# programs
CURL=curl
-RUBY=ruby
PERL=perl
MAKE=make
JULIA=julia
.DELETE_ON_ERROR:
-utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
- $(RUBY) data_generator.rb < UnicodeData.txt > $@
+RAWDATA = UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt EastAsianWidth.txt emoji-data.txt
-CharWidths.txt: charwidths.jl EastAsianWidth.txt
- $(JULIA) charwidths.jl > $@
+utf8proc_data.c.new: data_generator.jl $(RAWDATA)
+ $(JULIA) --project=. -e 'using Pkg; Pkg.instantiate()'
+ $(JULIA) --project=. data_generator.jl > $@
# Unicode data version (must also update utf8proc_unicode_version function)
UNICODE_VERSION=15.1.0
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt
Uppercase.txt: DerivedCoreProperties.txt
- $(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m]' > $@
+ $(JULIA) -e 'print(match(r"# Derived Property: Uppercase.*?# Total code points:"s, read("DerivedCoreProperties.txt", String)).match)' > $@
Lowercase.txt: DerivedCoreProperties.txt
- $(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m]' > $@
+ $(JULIA) -e 'print(match(r"# Derived Property: Lowercase.*?# Total code points:"s, read("DerivedCoreProperties.txt", String)).match)' > $@
clean:
- rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt
+ rm -f $(RAWDATA) NormalizationTest.txt GraphemeBreakTest.txt
rm -f Uppercase.txt Lowercase.txt
rm -f utf8proc_data.c.new
--- /dev/null
+# This file is machine-generated - editing it directly is not advised
+
+julia_version = "1.9.3"
+manifest_format = "2.0"
+project_hash = "bc0740aa2247b17bd49ba693fb87f41bbbddead6"
+
+[[deps.Adapt]]
+deps = ["LinearAlgebra", "Requires"]
+git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608"
+uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+version = "3.7.2"
+
+ [deps.Adapt.extensions]
+ AdaptStaticArraysExt = "StaticArrays"
+
+ [deps.Adapt.weakdeps]
+ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+
+[[deps.Artifacts]]
+uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
+
+[[deps.CompilerSupportLibraries_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
+version = "1.0.5+0"
+
+[[deps.Libdl]]
+uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+
+[[deps.LinearAlgebra]]
+deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
+uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+
+[[deps.OffsetArrays]]
+deps = ["Adapt"]
+git-tree-sha1 = "2ac17d29c523ce1cd38e27785a7d23024853a4bb"
+uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+version = "1.12.10"
+
+[[deps.OpenBLAS_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
+uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
+version = "0.3.21+4"
+
+[[deps.Random]]
+deps = ["SHA", "Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[deps.Requires]]
+deps = ["UUIDs"]
+git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
+uuid = "ae029012-a4dd-5104-9daa-d747884805df"
+version = "1.3.0"
+
+[[deps.SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+version = "0.7.0"
+
+[[deps.Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[deps.UUIDs]]
+deps = ["Random", "SHA"]
+uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
+[[deps.libblastrampoline_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
+version = "5.8.0+0"
--- /dev/null
+[deps]
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+++ /dev/null
-# Following work by @jiahao, we compute character widths using a combination of
-# * character category
-# * UAX 11: East Asian Width
-# * a few exceptions as needed
-# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
-#
-# We used to also use data from GNU Unifont, but that has proven unreliable
-# and unlikely to match widths assumed by terminals.
-#
-# Requires Julia (obviously) and FontForge.
-
-#############################################################################
-CharWidths = Dict{Int,Int}()
-
-#############################################################################
-# Use ../libutf8proc for category codes, rather than the one in Julia,
-# to minimize bootstrapping complexity when a new version of Unicode comes out.
-catcode(c) = ccall((:utf8proc_category,"../libutf8proc"), Cint, (Int32,), c)
-
-# utf8proc category constants (must match h)
-const UTF8PROC_CATEGORY_CN = 0
-const UTF8PROC_CATEGORY_LU = 1
-const UTF8PROC_CATEGORY_LL = 2
-const UTF8PROC_CATEGORY_LT = 3
-const UTF8PROC_CATEGORY_LM = 4
-const UTF8PROC_CATEGORY_LO = 5
-const UTF8PROC_CATEGORY_MN = 6
-const UTF8PROC_CATEGORY_MC = 7
-const UTF8PROC_CATEGORY_ME = 8
-const UTF8PROC_CATEGORY_ND = 9
-const UTF8PROC_CATEGORY_NL = 10
-const UTF8PROC_CATEGORY_NO = 11
-const UTF8PROC_CATEGORY_PC = 12
-const UTF8PROC_CATEGORY_PD = 13
-const UTF8PROC_CATEGORY_PS = 14
-const UTF8PROC_CATEGORY_PE = 15
-const UTF8PROC_CATEGORY_PI = 16
-const UTF8PROC_CATEGORY_PF = 17
-const UTF8PROC_CATEGORY_PO = 18
-const UTF8PROC_CATEGORY_SM = 19
-const UTF8PROC_CATEGORY_SC = 20
-const UTF8PROC_CATEGORY_SK = 21
-const UTF8PROC_CATEGORY_SO = 22
-const UTF8PROC_CATEGORY_ZS = 23
-const UTF8PROC_CATEGORY_ZL = 24
-const UTF8PROC_CATEGORY_ZP = 25
-const UTF8PROC_CATEGORY_CC = 26
-const UTF8PROC_CATEGORY_CF = 27
-const UTF8PROC_CATEGORY_CS = 28
-const UTF8PROC_CATEGORY_CO = 29
-
-#############################################################################
-# Use a default width of 1 for all character categories that are
-# letter/symbol/number-like, as well as for unassigned/private-use chars.
-# This can be overridden by UAX 11
-# below, but provides a useful nonzero fallback for new codepoints when
-# a new Unicode version has been released but Unifont hasn't been updated yet.
-
-zerowidth = Set{Int}() # categories that may contain zero-width chars
-push!(zerowidth, UTF8PROC_CATEGORY_MN)
-push!(zerowidth, UTF8PROC_CATEGORY_MC)
-push!(zerowidth, UTF8PROC_CATEGORY_ME)
-# push!(zerowidth, UTF8PROC_CATEGORY_SK) # see issue #167
-push!(zerowidth, UTF8PROC_CATEGORY_ZL)
-push!(zerowidth, UTF8PROC_CATEGORY_ZP)
-push!(zerowidth, UTF8PROC_CATEGORY_CC)
-push!(zerowidth, UTF8PROC_CATEGORY_CF)
-push!(zerowidth, UTF8PROC_CATEGORY_CS)
-for c in 0x0000:0x110000
- if catcode(c) ∉ zerowidth
- CharWidths[c] = 1
- end
-end
-
-#############################################################################
-# Widths from UAX #11: East Asian Width
-# .. these take precedence for all codepoints
-# listed explicitly as wide/full/narrow/half-width
-
-for line in readlines(open("EastAsianWidth.txt"))
- #Strip comments
- (isempty(line) || line[1] == '#') && continue
- precomment = split(line, '#')[1]
- #Parse code point range and width code
- tokens = split(precomment, ';')
- length(tokens) >= 2 || continue
- charrange = tokens[1]
- width = strip(tokens[2])
- #Parse code point range into Julia UnitRange
- rangetokens = split(charrange, "..")
- charstart = parse(UInt32, "0x"*rangetokens[1])
- charend = parse(UInt32, "0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])
-
- #Assign widths
- for c in charstart:charend
- if width=="W" || width=="F" # wide or full
- CharWidths[c]=2
- elseif width=="Na"|| width=="H"
- CharWidths[c]=1
- end
- end
-end
-
-#############################################################################
-# A few exceptions to the above cases, found by manual comparison
-# to other wcwidth functions and similar checks.
-
-for c in keys(CharWidths)
- cat = catcode(c)
-
- # make sure format control character (category Cf) have width 0
- # (some of these, like U+0601, can have a width in some cases
- # but normally act like prepended combining marks. U+fff9 etc
- # are also odd, but have zero width in typical terminal contexts)
- if cat==UTF8PROC_CATEGORY_CF
- CharWidths[c]=0
- end
-
- # Unifont has nonzero width for a number of non-spacing combining
- # characters, e.g. (in 7.0.06): f84,17b4,17b5,180b,180d,2d7f, and
- # the variation selectors
- if cat==UTF8PROC_CATEGORY_MN
- CharWidths[c]=0
- end
-
- # We also assign width of one to unassigned and private-use
- # codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
- # but since these are nonstandard it seems questionable to use Unifont metrics;
- # if they are printed as the replacement character U+FFFD they will have width 1).
- if cat==UTF8PROC_CATEGORY_CO || cat==UTF8PROC_CATEGORY_CN
- CharWidths[c]=1
- end
-
- # for some reason, Unifont has width-2 glyphs for ASCII control chars
- if cat==UTF8PROC_CATEGORY_CC
- CharWidths[c]=0
- end
-end
-
-#Soft hyphen is typically printed as a hyphen (-) in terminals.
-CharWidths[0x00ad]=1
-
-#By definition, should have zero width (on the same line)
-#0x002028 '
' category: Zl name: LINE SEPARATOR/
-#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
-CharWidths[0x2028]=0
-CharWidths[0x2029]=0
-
-#############################################################################
-# Output (to a file or pipe) for processing by data_generator.rb,
-# encoded as a sequence of intervals.
-
-firstc = 0x000000
-lastv = 0
-uhex(c) = uppercase(string(c,base=16,pad=4))
-for c in 0x0000:0x110000
- global firstc, lastv
- v = get(CharWidths, c, 0)
- if v != lastv || c == 0x110000
- v < 4 || error("invalid charwidth $v for $c")
- if firstc+1 < c
- println(uhex(firstc), "..", uhex(c-1), "; ", lastv)
- else
- println(uhex(firstc), "; ", lastv)
- end
- firstc = c
- lastv = v
- end
-end
--- /dev/null
+using OffsetArrays: Origin
+
+parsehex(str) = parse(UInt32, str, base=16)
+
+function parse_hex_range(line)
+ m = match(r"^([0-9A-F]+)(\.\.([0-9A-F]+))? +; +([^#]+)", line)
+ if isnothing(m)
+ return nothing
+ end
+ i = parsehex(m[1])
+ j = !isnothing(m[3]) ? parsehex(m[3]) : i
+ desc = rstrip(m[4])
+ return (i:j, desc)
+end
+
+function read_hex_ranges(filename)
+ [r for r in parse_hex_range.(readlines(filename)) if !isnothing(r)]
+end
+
+function collect_codepoints(range_desc, description)
+ list = UInt32[]
+ for (r,d) in range_desc
+ if d == description
+ append!(list, r)
+ end
+ end
+ list
+end
+
+function set_all!(d, keys, value)
+ for k in keys
+ d[k] = value
+ end
+end
+
+#-------------------------------------------------------------------------------
+
+derived_core_properties = read_hex_ranges("DerivedCoreProperties.txt")
+
+ignorable = Set(collect_codepoints(derived_core_properties, "Default_Ignorable_Code_Point"))
+uppercase = Set(collect_codepoints(derived_core_properties, "Uppercase"))
+lowercase = Set(collect_codepoints(derived_core_properties, "Lowercase"))
+
+
+#-------------------------------------------------------------------------------
+function derive_indic_conjunct_break(derived_core_properties)
+ props = Dict{UInt32, String}()
+ set_all!(props, collect_codepoints(derived_core_properties, "InCB; Linker"), "LINKER")
+ set_all!(props, collect_codepoints(derived_core_properties, "InCB; Consonant"), "CONSONANT")
+ set_all!(props, collect_codepoints(derived_core_properties, "InCB; Extend"), "EXTEND")
+ props
+end
+
+let indic_conjunct_break = derive_indic_conjunct_break(derived_core_properties)
+ global function get_indic_conjunct_break(code)
+ get(indic_conjunct_break, code, "NONE")
+ end
+end
+
+#-------------------------------------------------------------------------------
+function read_grapheme_boundclasses(grapheme_break_filename, emoji_data_filename)
+ grapheme_boundclass = Dict{UInt32, String}()
+ for (r,desc) in read_hex_ranges(grapheme_break_filename)
+ set_all!(grapheme_boundclass, r, Base.uppercase(desc))
+ end
+ for (r,desc) in read_hex_ranges(emoji_data_filename)
+ if desc == "Extended_Pictographic"
+ set_all!(grapheme_boundclass, r, "EXTENDED_PICTOGRAPHIC")
+ elseif desc == "Emoji_Modifier"
+ set_all!(grapheme_boundclass, r, "EXTEND")
+ end
+ end
+ return grapheme_boundclass
+end
+
+let grapheme_boundclasses = read_grapheme_boundclasses("GraphemeBreakProperty.txt", "emoji-data.txt")
+ global function get_grapheme_boundclass(code)
+ get(grapheme_boundclasses, code, "OTHER")
+ end
+end
+
+#-------------------------------------------------------------------------------
+function read_composition_exclusions(pattern)
+ section = match(pattern, read("CompositionExclusions.txt",String)).match
+ es = UInt32[]
+ for line in split(section, '\n')
+ m = match(r"^([0-9A-F]+) +#"i, line)
+ if !isnothing(m)
+ push!(es, parsehex(m[1]))
+ end
+ end
+ es
+end
+
+exclusions = Set(read_composition_exclusions(r"# \(1\) Script Specifics.*?# Total code points:"s))
+excl_version = Set(read_composition_exclusions(r"# \(2\) Post Composition Version precomposed characters.*?# Total code points:"s))
+
+# FIXME: Replicate a bug in the ruby code
+push!(exclusions, 0)
+push!(excl_version, 0)
+
+#-------------------------------------------------------------------------------
+function read_case_folding(filename)
+ case_folding = Dict{UInt32,Vector{UInt32}}()
+ for line in readlines(filename)
+ m = match(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"i, line)
+ !isnothing(m) || continue
+ case_folding[parsehex(m[1])] = parsehex.(split(m[2]))
+ end
+ case_folding
+end
+
+let case_folding = read_case_folding("CaseFolding.txt")
+ global function get_case_folding(code)
+ get(case_folding, code, nothing)
+ end
+end
+
+#-------------------------------------------------------------------------------
+# Utilities for reading per-char properties from UnicodeData.txt
+function split_unicode_data_line(line)
+ m = match(r"""
+ ([0-9A-F]+); # code
+ ([^;]+); # name
+ ([A-Z]+); # general category
+ ([0-9]+); # canonical combining class
+ ([A-Z]+); # bidi class
+ (<([A-Z]*)>)? # decomposition type
+                    ((\ ?[0-9A-F]+)*);  # decomposition mapping
+ ([0-9]*); # decimal digit
+ ([0-9]*); # digit
+ ([^;]*); # numeric
+ ([YN]*); # bidi mirrored
+ ([^;]*); # unicode 1.0 name
+ ([^;]*); # iso comment
+ ([0-9A-F]*); # simple uppercase mapping
+ ([0-9A-F]*); # simple lowercase mapping
+ ([0-9A-F]*)$ # simple titlecase mapping
+ """ix, line)
+ @assert !isnothing(m)
+ code = parse(UInt32, m[1], base=16)
+ (code = code,
+ name = m[2],
+ category = m[3],
+ combining_class = parse(Int, m[4]),
+ bidi_class = m[5],
+ decomp_type = m[7],
+ decomp_mapping = m[8] == "" ? nothing : parsehex.(split(m[8])),
+ bidi_mirrored = m[13] == "Y",
+ # issue #130: use nonstandard uppercase ß -> ẞ
+ # issue #195: if character is uppercase but has no lowercase mapping,
+ # then make lowercase mapping = itself (vice versa for lowercase)
+ uppercase_mapping = m[16] != "" ? parsehex(m[16]) :
+ code == 0x000000df ? 0x00001e9e :
+ m[17] == "" && code in lowercase ? code :
+ nothing,
+ lowercase_mapping = m[17] != "" ? parsehex(m[17]) :
+ m[16] == "" && code in uppercase ? code :
+ nothing,
+ titlecase_mapping = m[18] != "" ? parsehex(m[18]) :
+ code == 0x000000df ? 0x00001e9e :
+ nothing,
+ )
+end
+
+function read_unicode_data(filename)
+ raw_char_props = split_unicode_data_line.(readlines(filename))
+ char_props = Origin(0)(Vector{eltype(raw_char_props)}())
+ @assert issorted(raw_char_props, by=c->c.code)
+ raw_char_props = Iterators.Stateful(raw_char_props)
+ while !isempty(raw_char_props)
+ c = popfirst!(raw_char_props)
+ if occursin(", First>", c.name)
+ nc = popfirst!(raw_char_props)
+ @assert occursin(", Last>", nc.name)
+ name = replace(c.name, ", First"=>"")
+ for i in c.code:nc.code
+ push!(char_props, (; c..., name=name, code=i))
+ end
+ else
+ push!(char_props, c)
+ end
+ end
+ return char_props
+end
+
+char_props = read_unicode_data("UnicodeData.txt")
+char_hash = Dict(c.code=>c for c in char_props)
+
+#-------------------------------------------------------------------------------
+# Read character widths from UAX #11: East Asian Width
+function read_east_asian_widths(filename)
+ ea_widths = Dict{UInt32,Int}()
+ for (rng,widthcode) in read_hex_ranges(filename)
+ w = widthcode == "W" || widthcode == "F" ? 2 : # wide or full
+ widthcode == "Na"|| widthcode == "H" ? 1 : # narrow or half-width
+ nothing
+ if !isnothing(w)
+ set_all!(ea_widths, rng, w)
+ end
+ end
+ return ea_widths
+end
+
+let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
+ # Following work by @jiahao, we compute character widths using a combination of
+ # * character category
+ # * UAX 11: East Asian Width
+ # * a few exceptions as needed
+ # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
+ global function derive_char_width(code, category)
+ # Use a default width of 1 for all character categories that are
+ # letter/symbol/number-like, as well as for unassigned/private-use chars.
+ # This provides a useful nonzero fallback for new codepoints when a new
+ # Unicode version has been released.
+ width = 1
+
+ # Various zero-width categories
+ #
+ # "Sk" not included in zero width - see issue #167
+ if category in ("Mn", "Mc", "Me", "Zl", "Zp", "Cc", "Cf", "Cs")
+ width = 0
+ end
+
+ # Widths from UAX #11: East Asian Width
+ eaw = get(ea_widths, code, nothing)
+ if !isnothing(eaw)
+ width = eaw
+ end
+
+ # A few exceptional cases, found by manual comparison to other wcwidth
+ # functions and similar checks.
+ if category == "Mn"
+ width = 0
+ end
+
+ if code == 0x00ad
+ # Soft hyphen is typically printed as a hyphen (-) in terminals.
+ width = 1
+ elseif code == 0x2028 || code == 0x2029
+ #By definition, should have zero width (on the same line)
+ #0x002028 '
' category: Zl name: LINE SEPARATOR/
+ #0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
+ width = 0
+ end
+
+ return width
+ end
+end
+
+#-------------------------------------------------------------------------------
+# Construct data tables which will drive libutf8proc
+#
+# These tables are "compressed" with an ad-hoc compression scheme (largely some
+# simple deduplication and indexing) which can easily and efficiently be
+# decompressed on the C side at runtime.
+
+# Inverse decomposition mapping tables for combining two characters into a single one.
+comb1st_indices = Dict{UInt32,Int}()
+comb1st_indices_sorted_keys = Origin(0)(UInt32[])
+comb2nd_indices = Dict{UInt32,Int}()
+comb2nd_indices_sorted_keys = Origin(0)(UInt32[])
+comb2nd_indices_nonbasic = Set{UInt32}()
+comb_array = Origin(0)(Vector{Dict{Int,UInt32}}())
+for char in char_props
+ if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
+ length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) &&
+ char_hash[char.decomp_mapping[1]].combining_class == 0 &&
+ char.code ∉ exclusions
+ dm0 = char.decomp_mapping[1]
+ dm1 = char.decomp_mapping[2]
+ if !haskey(comb1st_indices, dm0)
+ comb1st_indices[dm0] = length(comb1st_indices)
+ push!(comb1st_indices_sorted_keys, dm0)
+ push!(comb_array, Dict{Int,UInt32}())
+ @assert length(comb1st_indices) == length(comb_array)
+ end
+ if !haskey(comb2nd_indices, dm1)
+ push!(comb2nd_indices_sorted_keys, dm1)
+ comb2nd_indices[dm1] = length(comb2nd_indices)
+ end
+ @assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
+ comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code
+ if char.code > 0xFFFF
+ push!(comb2nd_indices_nonbasic, dm1)
+ end
+ end
+end
+
+comb_indices = Dict{UInt32,Int}()
+comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
+comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
+let
+ cumoffset = 0
+ for dm0 in comb1st_indices_sorted_keys
+ index = comb1st_indices[dm0]
+ first = nothing
+ last = nothing
+ offset = 0
+ for b in eachindex(comb2nd_indices_sorted_keys)
+ dm1 = comb2nd_indices_sorted_keys[b]
+ if haskey(comb_array[index], b)
+ if isnothing(first)
+ first = offset
+ end
+ last = offset
+ if dm1 in comb2nd_indices_nonbasic
+ last += 1
+ end
+ end
+ offset += 1
+ if dm1 in comb2nd_indices_nonbasic
+ offset += 1
+ end
+ end
+ comb1st_indices_firstoffsets[index] = first
+ comb1st_indices_lastoffsets[index] = last
+ @assert !haskey(comb_indices, dm0)
+ comb_indices[dm0] = cumoffset
+ cumoffset += last - first + 1 + 2
+ end
+
+ offset = 0
+ for dm1 in comb2nd_indices_sorted_keys
+ @assert !haskey(comb_indices, dm1)
+ comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
+ @assert comb2nd_indices[dm1] + offset <= 0x4000
+ if dm1 in comb2nd_indices_nonbasic
+ comb_indices[dm1] |= 0x4000
+ offset += 1
+ end
+ end
+end
+
+utf16_encode(utf32_seq) = transcode(UInt16, transcode(String, utf32_seq))
+
+# Utility for packing all UTF-16 encoded sequences into one big array
+struct UTF16Sequences
+ storage::Vector{UInt16}
+ indices::Dict{Vector{UInt16},Int}
+end
+UTF16Sequences() = UTF16Sequences(UInt16[], Dict{Vector{UInt16},Int}())
+
+"""
+Return "sequence code" (seqindex in the C code) for a sequence: a UInt16 where
+* The 14 low bits are the index into the `sequences.storage` array where the
+ sequence resides
+* The two top bits are the length of the sequence, or if equal to 3, the first
+ entry of the sequence itself contains the length.
+"""
+function encode_sequence!(sequences::UTF16Sequences, utf32_seq::Vector)
+ if length(utf32_seq) == 0
+ return typemax(UInt16)
+ end
+ # lencode contains the length of the UTF-32 sequence after decoding
+ # No sequence has len 0, so we encode len 1 as 0, len 2 as 1.
+ # We have only 2 bits for the length, though, so longer sequences are
+ # encoded in the sequence data itself.
+ seq_lencode = length(utf32_seq) - 1
+ utf16_seq = utf16_encode(utf32_seq)
+ idx = get!(sequences.indices, utf16_seq) do
+ i = length(sequences.storage)
+ utf16_seq_enc = seq_lencode < 3 ? utf16_seq :
+ pushfirst!(copy(utf16_seq), seq_lencode)
+ append!(sequences.storage, utf16_seq_enc)
+ i
+ end
+ @assert idx <= 0x3FFF
+ seq_code = idx | (min(seq_lencode, 3) << 14)
+ return seq_code
+end
+
+function encode_sequence!(sequences::UTF16Sequences, code::Integer)
+ encode_sequence!(sequences, [code])
+end
+
+function encode_sequence!(sequences::UTF16Sequences, ::Nothing)
+ return typemax(UInt16)
+end
+
+function char_table_properties!(sequences, char)
+ code = char.code
+
+ return (
+ category = char.category,
+ combining_class = char.combining_class,
+ bidi_class = char.bidi_class,
+ decomp_type = char.decomp_type,
+ decomp_seqindex = encode_sequence!(sequences, char.decomp_mapping),
+ casefold_seqindex = encode_sequence!(sequences, get_case_folding(code)),
+ uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping),
+ lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping),
+ titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping),
+ comb_index = get(comb_indices, code, typemax(UInt16)),
+ bidi_mirrored = char.bidi_mirrored,
+ comp_exclusion = code in exclusions || code in excl_version,
+ ignorable = code in ignorable,
+ control_boundary = char.category in ("Zl", "Zp", "Cc", "Cf") &&
+ # FIXME: Ruby bug compat - should be `code in (0x200C, 0x200D)`
+ !(char.category in (0x200C, 0x200D)),
+ charwidth = derive_char_width(code, char.category),
+ boundclass = get_grapheme_boundclass(code),
+ indic_conjunct_break = get_indic_conjunct_break(code),
+ )
+end
+
+# Many character properties are duplicates. Deduplicate them, constructing a
+# per-character array of indices into the properties array
+sequences = UTF16Sequences()
+
+# FIXME: Hack to force ordering compat with Ruby code
+for c in char_props
+ encode_sequence!(sequences, c.decomp_mapping)
+ encode_sequence!(sequences, get_case_folding(c.code))
+end
+
+char_table_props = [char_table_properties!(sequences, cp) for cp in char_props]
+
+deduplicated_props = Origin(0)(Vector{eltype(char_table_props)}())
+char_property_indices = Origin(0)(zeros(Int, 0x00110000))
+let index_map = Dict{eltype(char_table_props),Int}()
+ for (char, table_props) in zip(char_props, char_table_props)
+ entry_idx = get!(index_map, table_props) do
+ idx = length(deduplicated_props)
+ push!(deduplicated_props, table_props)
+ idx
+ end
+ # Add 1 because unassigned codes occupy slot at index 0
+ char_property_indices[char.code] = entry_idx + 1
+ end
+end
+
+# Now compress char_property_indices by breaking it into pages and
+# deduplicating those (this works as compression because there are large
+# contiguous ranges of code space with identical properties)
+prop_page_indices = Int[]
+prop_pages = Int[]
+let
+ page_size = 0x100
+ page_index_map = Dict{Vector{Int}, Int}()
+ for page in Iterators.partition(char_property_indices, page_size)
+ page_idx = get!(page_index_map, page) do
+ idx = length(prop_pages)
+ append!(prop_pages, page)
+ idx
+ end
+ push!(prop_page_indices, page_idx)
+ end
+end
+
+#-------------------------------------------------------------------------------
+function write_c_index_array(io, array, linelen)
+ print(io, "{\n ")
+ i = 0
+ for x in array
+ i += 1
+ if i == linelen
+ i = 0
+ print(io, "\n ")
+ end
+ print(io, x, ", ")
+ end
+ print(io, "};\n\n")
+end
+
+function c_enum_name(prefix, str)
+ if isnothing(str)
+ return "0"
+ else
+ return "UTF8PROC_$(prefix)_$(Base.uppercase(str))"
+ end
+end
+
+function c_uint16(seqindex)
+ if seqindex == typemax(UInt16)
+ return "UINT16_MAX"
+ else
+ return string(seqindex)
+ end
+end
+
+function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
+ comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
+ comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
+ print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
+ write_c_index_array(io, sequences.storage, 8)
+ print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
+ write_c_index_array(io, prop_page_indices, 8)
+ print(io, "static const utf8proc_uint16_t utf8proc_stage2table[] = ")
+ write_c_index_array(io, prop_pages, 8)
+
+ print(io, """
+ static const utf8proc_property_t utf8proc_properties[] = {
+ {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
+ """)
+ for prop in deduplicated_props
+ print(io, " {",
+ c_enum_name("CATEGORY", prop.category), ", ",
+ prop.combining_class, ", ",
+ c_enum_name("BIDI_CLASS", prop.bidi_class), ", ",
+ c_enum_name("DECOMP_TYPE", prop.decomp_type), ", ",
+ c_uint16(prop.decomp_seqindex), ", ",
+ c_uint16(prop.casefold_seqindex), ", ",
+ c_uint16(prop.uppercase_seqindex), ", ",
+ c_uint16(prop.lowercase_seqindex), ", ",
+ c_uint16(prop.titlecase_seqindex), ", ",
+ c_uint16(prop.comb_index), ", ",
+ prop.bidi_mirrored, ", ",
+ prop.comp_exclusion, ", ",
+ prop.ignorable, ", ",
+ prop.control_boundary, ", ",
+ prop.charwidth, ", ",
+ "0, ", # bitfield padding
+ c_enum_name("BOUNDCLASS", prop.boundclass), ", ",
+ c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break),
+ "},\n"
+ )
+ end
+ print(io, "};\n\n")
+
+ print(io, "static const utf8proc_uint16_t utf8proc_combinations[] = {\n ")
+ i = 0
+ for a in eachindex(comb1st_indices_firstoffsets)
+ offset = 0
+ print(io, comb1st_indices_firstoffsets[a], ", ", comb1st_indices_lastoffsets[a], ", ")
+ for b in eachindex(comb2nd_indices_sorted_keys)
+ dm1 = comb2nd_indices_sorted_keys[b]
+ if offset > comb1st_indices_lastoffsets[a]
+ break
+ end
+ if offset >= comb1st_indices_firstoffsets[a]
+ i += 1
+ if i == 8
+ i = 0
+ print(io, "\n ")
+ end
+ v = get(comb_array[a], b, 0)
+ if dm1 in comb2nd_indices_nonbasic
+ print(io, (v & 0xFFFF0000) >> 16, ", ")
+ end
+ print(io, v & 0xFFFF, ", ")
+ end
+ offset += 1
+ if dm1 in comb2nd_indices_nonbasic
+ offset += 1
+ end
+ end
+ print(io, "\n")
+ end
+ print(io, "};\n\n")
+end
+
+
+if !isinteractive()
+ print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
+ comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
+ comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
+end
+
+++ /dev/null
-#!/usr/bin/env ruby
-
-# This file was used to generate the 'unicode_data.c' file by parsing the
-# Unicode data file 'UnicodeData.txt' of the Unicode Character Database.
-# It is included for informational purposes only and not intended for
-# production use.
-
-
-# Copyright (c) 2018 Steven G. Johnson, Tony Kelman, Keno Fischer,
-# Benito van der Zander, Michaël Meyer, and other contributors.
-# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-# This file contains derived data from a modified version of the
-# Unicode data files. The following license applies to that data:
-#
-# COPYRIGHT AND PERMISSION NOTICE
-#
-# Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
-# under the Terms of Use in http://www.unicode.org/copyright.html.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of the Unicode data files and any associated documentation (the "Data
-# Files") or Unicode software and any associated documentation (the
-# "Software") to deal in the Data Files or Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, and/or sell copies of the Data Files or Software, and
-# to permit persons to whom the Data Files or Software are furnished to do
-# so, provided that (a) the above copyright notice(s) and this permission
-# notice appear with all copies of the Data Files or Software, (b) both the
-# above copyright notice(s) and this permission notice appear in associated
-# documentation, and (c) there is clear notice in each modified Data File or
-# in the Software as well as in the documentation associated with the Data
-# File(s) or Software that the data or software has been modified.
-#
-# THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
-# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
-# THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
-# INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
-# CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
-# USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-# PERFORMANCE OF THE DATA FILES OR SOFTWARE.
-#
-# Except as contained in this notice, the name of a copyright holder shall
-# not be used in advertising or otherwise to promote the sale, use or other
-# dealings in these Data Files or Software without prior written
-# authorization of the copyright holder.
-
-
-$ignorable_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m]
-$ignorable = []
-$ignorable_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $ignorable << e2 }
- elsif entry =~ /^[0-9A-F]+/
- $ignorable << $&.hex
- end
-end
-
-$uppercase_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Uppercase.*?# Total code points:/m]
-$uppercase = []
-$uppercase_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $uppercase << e2 }
- elsif entry =~ /^[0-9A-F]+/
- $uppercase << $&.hex
- end
-end
-
-$lowercase_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Lowercase.*?# Total code points:/m]
-$lowercase = []
-$lowercase_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $lowercase << e2 }
- elsif entry =~ /^[0-9A-F]+/
- $lowercase << $&.hex
- end
-end
-
-$icb_linker_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Linker.*?# Total code points:/m]
-$icb = Hash.new("UTF8PROC_INDIC_CONJUNCT_BREAK_NONE")
-$icb_linker_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER" }
- elsif entry =~ /^[0-9A-F]+/
- $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER"
- end
-end
-$icb_consonant_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Consonant.*?# Total code points:/m]
-$icb_consonant_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT" }
- elsif entry =~ /^[0-9A-F]+/
- $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT"
- end
-end
-$icb_extend_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Extend.*?# Total code points:/m]
-$icb_extend_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND" }
- elsif entry =~ /^[0-9A-F]+/
- $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND"
- end
-end
-
-$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8')
-$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
-$grapheme_boundclass_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
- $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_" + $3.upcase }
- elsif entry =~ /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
- $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase
- end
-end
-
-$emoji_data_list = File.read("emoji-data.txt", :encoding => 'utf-8')
-$emoji_data_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
- $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" }
- elsif entry =~ /^([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
- $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC"
- elsif entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
- $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTEND" }
- elsif entry =~ /^([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
- $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTEND"
- end
-end
-
-$charwidth_list = File.read("CharWidths.txt", :encoding => 'utf-8')
-$charwidth = Hash.new(0)
-$charwidth_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/
- $1.hex.upto($2.hex) { |e2| $charwidth[e2] = $3.to_i }
- elsif entry =~ /^([0-9A-F]+)\s*;\s*([0-9]+)/
- $charwidth[$1.hex] = $2.to_i
- end
-end
-
-$exclusions = File.read("CompositionExclusions.txt", :encoding => 'utf-8')[/# \(1\) Script Specifics.*?# Total code points:/m]
-$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex }
-
-$excl_version = File.read("CompositionExclusions.txt", :encoding => 'utf-8')[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
-$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }
-
-$case_folding_string = File.read("CaseFolding.txt", :encoding => 'utf-8')
-$case_folding = {}
-$case_folding_string.chomp.split("\n").each do |line|
- next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i
- $case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
-end
-
-$int_array = []
-$int_array_indicies = {}
-
-def str2c(string, prefix)
- return "0" if string.nil?
- return "UTF8PROC_#{prefix}_#{string.upcase}"
-end
-def pushary(array)
- idx = $int_array_indicies[array]
- unless idx
- $int_array_indicies[array] = $int_array.length
- idx = $int_array.length
- array.each { |entry| $int_array << entry }
- end
- return idx
-end
-def cpary2utf16encoded(array)
- return array.flat_map { |cp|
- if (cp <= 0xFFFF)
- raise "utf-16 code: #{cp}" if cp & 0b1111100000000000 == 0b1101100000000000
- cp
- else
- temp = cp - 0x10000
- [(temp >> 10) | 0b1101100000000000, (temp & 0b0000001111111111) | 0b1101110000000000]
- end
- }
-end
-def cpary2c(array)
- return "UINT16_MAX" if array.nil? || array.length == 0
- lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
- array = cpary2utf16encoded(array)
- if lencode >= 3 #we have only 2 bits for the length
- array = [lencode] + array
- lencode = 3
- end
- idx = pushary(array)
- raise "Array index out of bound" if idx > 0x3FFF
- return "#{idx | (lencode << 14)}"
-end
-def singlecpmap(cp)
- return "UINT16_MAX" if cp == nil
- idx = pushary(cpary2utf16encoded([cp]))
- raise "Array index out of bound" if idx > 0xFFFF
- return "#{idx}"
-end
-
-class UnicodeChar
- attr_accessor :code, :name, :category, :combining_class, :bidi_class,
- :decomp_type, :decomp_mapping,
- :bidi_mirrored,
- :uppercase_mapping, :lowercase_mapping, :titlecase_mapping,
- #caches:
- :c_entry_index, :c_decomp_mapping, :c_case_folding
- def initialize(line)
- raise "Could not parse input." unless line =~ /^
- ([0-9A-F]+); # code
- ([^;]+); # name
- ([A-Z]+); # general category
- ([0-9]+); # canonical combining class
- ([A-Z]+); # bidi class
- (<([A-Z]*)>)? # decomposition type
- ((\ ?[0-9A-F]+)*); # decompomposition mapping
- ([0-9]*); # decimal digit
- ([0-9]*); # digit
- ([^;]*); # numeric
- ([YN]*); # bidi mirrored
- ([^;]*); # unicode 1.0 name
- ([^;]*); # iso comment
- ([0-9A-F]*); # simple uppercase mapping
- ([0-9A-F]*); # simple lowercase mapping
- ([0-9A-F]*)$/ix # simple titlecase mapping
- @code = $1.hex
- @name = $2
- @category = $3
- @combining_class = Integer($4)
- @bidi_class = $5
- @decomp_type = $7
- @decomp_mapping = ($8=='') ? nil :
- $8.split.collect { |element| element.hex }
- @bidi_mirrored = ($13=='Y') ? true : false
- # issue #130: use nonstandard uppercase ß -> ẞ
- # issue #195: if character is uppercase but has no lowercase mapping,
- # then make lowercase mapping = itself (vice versa for lowercase)
- @uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : ($17=='' && $lowercase.include?(code) ? code : nil)) : $16.hex
- @lowercase_mapping = ($17=='') ? ($16=='' && $uppercase.include?(code) ? code : nil) : $17.hex
- @titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
- end
- def case_folding
- $case_folding[code]
- end
- def c_entry(comb_indicies)
- " " <<
- "{#{str2c category, 'CATEGORY'}, #{combining_class}, " <<
- "#{str2c bidi_class, 'BIDI_CLASS'}, " <<
- "#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
- "#{c_decomp_mapping}, " <<
- "#{c_case_folding}, " <<
- "#{singlecpmap uppercase_mapping }, " <<
- "#{singlecpmap lowercase_mapping }, " <<
- "#{singlecpmap titlecase_mapping }, " <<
- "#{comb_indicies[code] ? comb_indicies[code]: 'UINT16_MAX'}, " <<
- "#{bidi_mirrored}, " <<
- "#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
- "#{$ignorable.include?(code)}, " <<
- "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
- "#{$charwidth[code]}, 0, " <<
- "#{$grapheme_boundclass[code]}, " <<
- "#{$icb[code]}},\n"
- end
-end
-
-chars = []
-char_hash = {}
-
-while gets
- if $_ =~ /^([0-9A-F]+);<[^;>,]+, First>;/i
- first = $1.hex
- gets
- char = UnicodeChar.new($_)
- raise "No last character of sequence found." unless
- $_ =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i
- last = $1.hex
- name = "<#{$2}>"
- for i in first..last
- char_clone = char.clone
- char_clone.code = i
- char_clone.name = name
- char_hash[char_clone.code] = char_clone
- chars << char_clone
- end
- else
- char = UnicodeChar.new($_)
- char_hash[char.code] = char
- chars << char
- end
-end
-
-comb1st_indicies = {}
-comb2nd_indicies = {}
-comb2nd_indicies_sorted_keys = []
-comb2nd_indicies_nonbasic = {}
-comb_array = []
-
-chars.each do |char|
- if !char.nil? and char.decomp_type.nil? and char.decomp_mapping and
- char.decomp_mapping.length == 2 and !char_hash[char.decomp_mapping[0]].nil? and
- char_hash[char.decomp_mapping[0]].combining_class == 0 and
- not $exclusions.include?(char.code)
-
- dm0 = char.decomp_mapping[0]
- dm1 = char.decomp_mapping[1]
- unless comb1st_indicies[dm0]
- comb1st_indicies[dm0] = comb1st_indicies.keys.length
- end
- unless comb2nd_indicies[dm1]
- comb2nd_indicies_sorted_keys << dm1
- comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
- end
- comb_array[comb1st_indicies[dm0]] ||= []
- raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]]
- comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code
-
- comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
- end
- char.c_decomp_mapping = cpary2c(char.decomp_mapping)
- char.c_case_folding = cpary2c(char.case_folding)
-end
-
-comb_indicies = {}
-cumoffset = 0
-comb1st_indicies_lastoffsets = []
-comb1st_indicies_firstoffsets = []
-comb1st_indicies.each do |dm0, index|
- first = nil
- last = nil
- offset = 0
- comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
- if comb_array[index][b]
- first = offset unless first
- last = offset
- last += 1 if comb2nd_indicies_nonbasic[dm1]
- end
- offset += 1
- offset += 1 if comb2nd_indicies_nonbasic[dm1]
- end
- comb1st_indicies_firstoffsets[index] = first
- comb1st_indicies_lastoffsets[index] = last
- raise "double index" if comb_indicies[dm0]
- comb_indicies[dm0] = cumoffset
- cumoffset += last - first + 1 + 2
-end
-
-offset = 0
-comb2nd_indicies_sorted_keys.each do |dm1|
- raise "double index" if comb_indicies[dm1]
- comb_indicies[dm1] = 0x8000 | (comb2nd_indicies[dm1] + offset)
- raise "too large comb index" if comb2nd_indicies[dm1] + offset > 0x4000
- if comb2nd_indicies_nonbasic[dm1]
- comb_indicies[dm1] = comb_indicies[dm1] | 0x4000
- offset += 1
- end
-end
-
-properties_indicies = {}
-properties = []
-chars.each do |char|
- c_entry = char.c_entry(comb_indicies)
- char.c_entry_index = properties_indicies[c_entry]
- unless char.c_entry_index
- properties_indicies[c_entry] = properties.length
- char.c_entry_index = properties.length
- properties << c_entry
- end
-end
-
-stage1 = []
-stage2 = []
-for code in 0...0x110000
- next unless code % 0x100 == 0
- stage2_entry = []
- for code2 in code...(code+0x100)
- if char_hash[code2]
- stage2_entry << (char_hash[code2].c_entry_index + 1)
- else
- stage2_entry << 0
- end
- end
- old_index = stage2.index(stage2_entry)
- if old_index
- stage1 << (old_index * 0x100)
- else
- stage1 << (stage2.length * 0x100)
- stage2 << stage2_entry
- end
-end
-
-$stdout << "static const utf8proc_uint16_t utf8proc_sequences[] = {\n "
-i = 0
-$int_array.each do |entry|
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- $stdout << entry << ", "
-end
-$stdout << "};\n\n"
-
-$stdout << "static const utf8proc_uint16_t utf8proc_stage1table[] = {\n "
-i = 0
-stage1.each do |entry|
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- $stdout << entry << ", "
-end
-$stdout << "};\n\n"
-
-$stdout << "static const utf8proc_uint16_t utf8proc_stage2table[] = {\n "
-i = 0
-stage2.flatten.each do |entry|
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- $stdout << entry << ", "
-end
-$stdout << "};\n\n"
-
-$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
-$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},\n"
-properties.each { |line|
- $stdout << line
-}
-$stdout << "};\n\n"
-
-
-
-$stdout << "static const utf8proc_uint16_t utf8proc_combinations[] = {\n "
-i = 0
-comb1st_indicies.keys.each_index do |a|
- offset = 0
- $stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
- comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
- break if offset > comb1st_indicies_lastoffsets[a]
- if offset >= comb1st_indicies_firstoffsets[a]
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- v = comb_array[a][b] ? comb_array[a][b] : 0
- $stdout << (( v & 0xFFFF0000 ) >> 16) << ", " if comb2nd_indicies_nonbasic[dm1]
- $stdout << (v & 0xFFFF) << ", "
- end
- offset += 1
- offset += 1 if comb2nd_indicies_nonbasic[dm1]
- end
- $stdout << "\n"
-end
-$stdout << "};\n\n"